Introduction

Normalisescoring rates by standardising to score per 40 min (Kubatko, 2007) several models have been attempted (Spector 2020). difficult to predict how 5 opponents will perform WITH each otehr and also against an opposition

Reading and Cleaning the Raw Data

Add standardised team scoring rate per 40min period (Kubatko, 2007), and win%

This is a link to the data fields

Chicago Bulls payroll is 24th or the 30 NBA teams

# Barplot team payrolls
team_sal <- ggplot(data = df_team_payroll, aes(x = reorder(team, COL2), y = COL2)) +
  geom_bar(stat="identity")
# Horizontal bar plot
new <- team_sal + coord_flip(ylim = c(50, 160))
#label axes
new + labs(title = "Team payroll (2018-19)", face = "italics", x = "Team", y = "Millions") 

Win % versus team salary

df_team_WL <- mutate(df_team_WL, winP = (W / (W + L)* 100))
#add win% column
df_team_pts40 <- bind_cols(df_team_pts40, df_team_WL[-c(1:25)])
# team average pts per minute
team_pts40 <- ggplot(data = df_team_pts40, aes(x = reorder(Team, winP), y = winP)) +
  geom_bar(stat="identity")
# Horizontal bar plot
new1 <- team_pts40 + coord_flip(ylim = c(20, 75)) +
  labs(title = "Team game win percentages (2018-19)", x = "Team", y = "Win %")
new1

Chicago Bulls are ranked 29th of 30 in season average points scored per minute played

# team average pts per minute
ggteam_pts40 <- ggplot(data = df_team_pts40, aes(x = reorder(Team, team_pts40), y = team_pts40)) +
  geom_bar(stat="identity")
# Horizontal bar plot
new2 <- ggteam_pts40 + coord_flip(ylim = c(17, 20)) +
  labs(title = "Team average points scored per 40 minutes played (2018-19)", x = "Team", y = "Points / 40 Minutes")
new2

standardise variables to events per 40min

df_team_pts40 <- mutate(df_team_pts40, AST40 = (AST / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, TOV40 = (TOV / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, STL40 = (STL / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, BLK40 = (BLK / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, PF40 = (PF / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, TRB40 = (TRB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, ORB40 = (ORB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, DRB40 = (DRB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, FG40 = (FG / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, FGA40 = (FGA / MP)* 40)

correlation wins and points scored

#points/40 vs win%
df_team_pts40 %>%
  ggplot(aes(x = team_pts40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$team_pts40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.9569494
#assists vs win%
df_team_pts40 %>%
  ggplot(aes(x = AST40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$AST40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.548208
#turnovers vs win%
df_team_pts40 %>%
  ggplot(aes(x = TOV40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$TOV40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.1385349
#steals vs win%
df_team_pts40 %>%
  ggplot(aes(x = STL40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$STL40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2818981
#blocks vs win%
df_team_pts40 %>%
  ggplot(aes(x = BLK40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$BLK40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.4495809
#fouls vs win%
df_team_pts40 %>%
  ggplot(aes(x = PF40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$PF40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2266698
#total rebounds vs win%
df_team_pts40 %>%
  ggplot(aes(x = TRB40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$TRB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.5195445
#ORB vs win%
df_team_pts40 %>%
  ggplot(aes(x = ORB40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$ORB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2199931
#DRB vs win%
df_team_pts40 %>%
  ggplot(aes(x = DRB40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$DRB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.4901269
#FG vs win%
df_team_pts40 %>%
  ggplot(aes(x = FG40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$FG40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.7850829
#FGA vs win%
df_team_pts40 %>%
  ggplot(aes(x = FGA40,y = winP)) + 
  geom_point(colour = "dodgerblue") +
  ylim(0, 100) +
  geom_smooth(method = "lm", colour = "magenta") +
  geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$FGA40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.5700753

team stats related to pts40

#AST40 vs pts40
df_team_pts40 %>%
  ggplot(aes(x = AST40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$AST40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5823599
#TOV40 vs pts40
df_team_pts40 %>%
  ggplot(aes(x = TOV40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$TOV40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.06309648
#STL40 vs pts40
df_team_pts40 %>%
  ggplot(aes(x = STL40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$STL40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.1702733
#BLK40 vs pts40
df_team_pts40 %>%
  ggplot(aes(x = BLK40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$BLK40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.4192942
#TRB40 vs pts40
df_team_pts40 %>%
  ggplot(aes(x = TRB40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$TRB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5844159
#DRB vs pts40
df_team_pts40 %>%
  ggplot(aes(x = DRB40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$DRB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5767078
#ORB vs pts40
df_team_pts40 %>%
  ggplot(aes(x = ORB40,y = team_pts40)) + 
  geom_point(colour = "dodgerblue") +
  geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'

cor(x = df_team_pts40$ORB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.2002259

** choose variables associated with high scoring ** 3 explanatory variables associated with win% and total score 40 are: Points Scored Assists Blocks Total rebounds

Combine player statistics and player salary data files by ‘player names’

# join df_sal & df_pl_stat by "player_name"
df_players <- left_join(x = df_sal, y = df_players, by = c("player_name"))
write.csv(df_players, file = 'data_processed/2018-19_nba_player-salaries-stats_.csv', row.names = FALSE)

Clean processed data file

sum(is.na(df_players)) 
## [1] 1901
naniar::vis_miss(df_players)

#filter out players with no position or game time
df_pl_clean <- drop_na(df_players, Pos)
naniar::vis_miss(df_pl_clean)

Create normalised data by adding variable “points per minute played” Explore the data filter out games less than 10 to minimise outliers

#ggplot(data = df_players) +
# geom_histogram(mapping = aes(x = salmil), colour = "black", fill = "dodgerblue") +
#  labs(x = "millions", y = "number of players", title = "Player salary distribution per year")

df_players <- mutate(df_players, pts40 = (PTS / MP)* 40)

df_players %>%
  group_by(player_id) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pts40), colour = "black", fill = "dodgerblue") +
  labs(x = "points per minutes played", y = "number of players", title = "Distribution of points scored per minute played", subtitle = "(games played > 10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 64 rows containing non-finite values (stat_bin).

df_players %>%
  group_by(player_id) %>%
  filter(G > 10) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pts40), colour = "black", fill = "dodgerblue") +
  labs(x = "points per minutes played", y = "number of players", title = "Distribution of points scored per minute played", subtitle = "(games played > 10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

merge rows, add column sal.mil

df_players <- mutate(df_players, salmil = salary / 100000)
#combine rows with same player names
df_players <- bind_rows(df_players) %>%
   group_by(player_id, player_name, Pos) %>%
   summarise(salmil = mean(salmil, na.rm = TRUE),
              G = sum(G, na.rm = TRUE),
             MP = sum(MP, na.rm = TRUE),
            TRB = sum(TRB, na.rm = TRUE),
            AST = sum(AST, na.rm = TRUE),
            BLK = sum(BLK, na.rm = TRUE),
            PTS = sum(PTS, na.rm = TRUE))
## `summarise()` has grouped output by 'player_id', 'player_name'. You can override using the `.groups` argument.
df_players40 <- bind_rows(df_players) %>%
   group_by(player_id, player_name) %>%
   summarise(salmil = mean(salmil, na.rm = TRUE),
              G = sum(G, na.rm = TRUE),
             MP = sum(MP, na.rm = TRUE),
            TRB = sum(TRB, na.rm = TRUE),
            AST = sum(AST, na.rm = TRUE),
            BLK = sum(BLK, na.rm = TRUE),
            PTS = sum(PTS, na.rm = TRUE))
## `summarise()` has grouped output by 'player_id'. You can override using the `.groups` argument.
df_players <- mutate(df_players, pts40_ind = (PTS / MP)* 40)
df_players <- mutate(df_players, AST40 = (AST / MP)* 40)
df_players <- mutate(df_players, BLK40 = (BLK / MP)* 40)
df_players <- mutate(df_players, TRB40 = (TRB / MP)* 40)

df_players40 <- mutate(df_players40, pts40_ind = (PTS / MP)* 40)
df_players40 <- mutate(df_players40, AST40 = (AST / MP)* 40)
df_players40 <- mutate(df_players40, BLK40 = (BLK / MP)* 40)
df_players40 <- mutate(df_players40, TRB40 = (TRB / MP)* 40)

look for relationships b/t Player positions and Explanatory Variables

#player group by pts_ind
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, pts40_ind, FUN = median),
                             y = pts40_ind, colour = reorder(Pos, pts40_ind,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

#player group by AST
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, AST40, FUN = median),
                             y = AST40, colour = reorder(Pos, AST40,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

#player group by BLK
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, BLK40, FUN = median),
                             y = BLK40, colour = reorder(Pos, BLK40,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

#player group by TRB
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, TRB40, FUN = median),
                             y = TRB40, colour = reorder(Pos, TRB40,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

fit <- lm(winP ~ AST40 + BLK40 + TRB40, data = df_team_pts40)
tidy(fit, conf.int = TRUE)
## # A tibble: 4 x 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)   -125.      47.1     -2.65   0.0134 -222.        -28.1
## 2 AST40           14.1      7.81     1.80   0.0829   -1.97       30.1
## 3 BLK40           17.3     22.1      0.781  0.442   -28.2        62.8
## 4 TRB40           13.8      6.80     2.03   0.0523   -0.149      27.8

** Test Assumptions ** 1. Response variable (team win %) IS continuous 2. Explanatory variables (assists, blocks & total rebounds are continuous integers) 3. Independence: potentially fails due to repeated measures of positions

car::durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.4891626     0.9163006       0
##  Alternative hypothesis: rho != 0
  1. Linear relationship
car::avPlots(fit)

  1. Detect Outliers:
std_res <- rstandard(fit)
points <- 1:length(std_res)
res_labels <- if_else(abs(std_res) >= 2.5, paste(points), "")

ggplot(data = NULL, aes(x = points, y = std_res)) + geom_point() +
geom_text(aes(label = res_labels), nudge_y = 0.3) + ylim(c(-4,4)) +
geom_hline(yintercept = c(-2.5, 2.5), colour = "red", linetype = "dashed")

lm +
geom_text(aes(label = res_labels), nudge_x = 0.002)
## NULL
  1. Cooks distance
cook <- cooks.distance(fit)
cook_labels <- if_else(cook >= 0.015, paste(points), "")

ggplot(data = NULL, aes(x = points, y = cook)) + geom_point() +
geom_text(aes(label = cook_labels), nudge_y = 0.01)

6. Homoscedasticity

 res <- residuals(fit)
fitted <- predict(fit)

ggplot(data = NULL, aes(x = fitted, y = res)) +
  geom_point(colour = "dodgerblue") +
  geom_smooth(se = FALSE, colour = "magenta")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

7. Normally distributed residuals

ggplot(data = NULL, aes(sample = res)) + stat_qq() + stat_qq_line()

  1. Multicollinearity
#team
pairs(formula = ~ winP + AST40 + BLK40 + TRB40, data = df_team_pts40)

fit <- lm(winP ~ AST40 + BLK40 + TRB40, data = df_team_pts40)
tidy(fit, conf.int = TRUE)
## # A tibble: 4 x 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)   -125.      47.1     -2.65   0.0134 -222.        -28.1
## 2 AST40           14.1      7.81     1.80   0.0829   -1.97       30.1
## 3 BLK40           17.3     22.1      0.781  0.442   -28.2        62.8
## 4 TRB40           13.8      6.80     2.03   0.0523   -0.149      27.8
sqrt(car::vif(fit))
##    AST40    BLK40    TRB40 
## 1.247104 1.225111 1.107027
head(predict(fit))
##        1        2        3        4        5        6 
## 66.97234 67.84684 62.32325 62.36924 48.60328 52.65836
#TRB
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>% 
ggplot() +
geom_histogram(aes(x = TRB40, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)

#assist
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>% 
ggplot() +
geom_histogram(aes(x = AST40, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)

#pts40
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>% 
ggplot() +
geom_histogram(aes(x = pts40_ind, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)

# create panels by match_outcome
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, AST40, FUN = median),
                             y = AST40, colour = reorder(Pos, AST40,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

#salary by points per minutes played
df_players40 %>% filter(G > 10, PTS > 0) %>% group_by(player_id) %>%
  ggplot(mapping = aes(x = salmil, y = pts40_ind)) + 
  geom_point()

#salary by points per minutes played
df_players %>% filter(G > 10) %>% group_by(player_id) %>%
  ggplot(mapping = aes(x = salmil, y = pts40_ind, colour = Pos)) + 
  geom_point()

#player group by salmil
df_players %>%
  filter(G > 10) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(Pos, salmil, FUN = median),
                             y = salmil, colour = reorder(Pos, salmil,
                                                              FUN = median))) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.